# base libraries
import pandas as pd
import numpy as np
# preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# feature selection libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import VarianceThreshold
# modelling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
#model selection & tuning libraries
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# visualisation libraries
import plotly.figure_factory as ff
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# compute performance libraries
from joblib import Parallel, delayed
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',100)
def nulsCount(df):
    """Summarise missing/unexpected values per column.

    Counts NULL/NaN entries, blank strings and common placeholder tokens
    for every column, keeps only the columns where at least one kind of
    missing value occurs, then adds percentage columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.

    Returns
    -------
    pandas.DataFrame or None
        Summary table with counts and percentages, or None when no
        missing values are found (so the notebook displays nothing).
    """
    if df.shape[0] == 0:  # guard: percentages below would divide by zero
        return None
    d2 = pd.DataFrame(columns=["NULL", "NAN", "BLANKS", "UNEXP"])
    try:
        d2["NULL"] = df.isnull().sum().astype('uint32')            # null values
        d2["NAN"] = df.isna().sum().astype('uint32')               # NaN values
        d2["BLANKS"] = df.isin(["", " "]).sum().astype('uint32')   # blank strings
        # common placeholder tokens used to mark missing data
        d2["UNEXP"] = df.isin(["-", "?", ".", "NA", "N/A", "Unknown"]).sum().astype('uint32')
    except Exception:  # was a bare except; kept best-effort but narrowed and explicit
        pass
    # shortlist: keep only columns that actually have missing values
    d2 = d2.loc[(d2["NULL"] != 0) | (d2["NAN"] != 0) | (d2["BLANKS"] != 0) | (d2["UNEXP"] != 0)]
    # convert counts to percentages of total rows
    for c in ["NULL", "NAN", "BLANKS", "UNEXP"]:
        d2[c + " %"] = d2[c].mul(100 / df.shape[0]).round(2)
    # interleave count and percentage columns
    d2 = d2[["NULL", "NULL %", "NAN", "NAN %", "BLANKS", "BLANKS %", "UNEXP", "UNEXP %"]]
    return d2 if d2.shape[0] else None
class SCFS():
    """SCFS: Standard deviation and Cosine similarity based Feature Selection.

    Reference article for the feature scoring:
    https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
    Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant
    """
    def __init__(self, kind='exp'):
        """kind = {'exp','reciprocal','anti-similarity'} default='exp'"""
        self.kind = kind
        self.fitted = False

    def discernibility(self):
        """Feature discernibility: the sample standard deviation of each column."""
        m = self.df.shape[0]
        vals = {}
        for col in self.df.columns:
            centred = self.df[col] - sum(self.df[col]) / m
            vals[col] = np.sqrt(sum(centred ** 2) / (m - 1))
        self.dis = pd.Series(vals, index=self.df.columns, dtype=float)

    def cosineSimilarity(self):
        """Pairwise absolute cosine similarity between all feature columns."""
        cols = self.df.columns
        self.cosdf = pd.DataFrame(columns=cols, index=cols)
        # hoist the per-column norms; values are identical to recomputing inline
        norms = {c: np.sqrt(self.df[c].dot(self.df[c])) for c in cols}
        for a in cols:
            for b in cols:
                self.cosdf.loc[a, b] = np.abs(self.df[a].dot(self.df[b])) / (norms[a] * norms[b])

    def independence(self):
        """Feature independence derived from the similarity matrix.

        The feature with the largest discernibility aggregates its whole
        similarity row with max(); every other feature aggregates with min()
        over the features of strictly larger discernibility.
        """
        top = self.dis.index[np.argmax(self.dis)]
        self.ind = pd.Series(index=self.df.columns, dtype=float)
        for col in self.df.columns:
            if col == top:
                row, agg = self.cosdf.loc[col], max
            else:
                stronger = self.dis[self.dis > self.dis[col]].index
                row, agg = self.cosdf.loc[col, stronger], min
            if self.kind == 'exp':
                self.ind[col] = np.exp(agg(-row))
            elif self.kind == 'reciprocal':
                self.ind[col] = agg(1 / row)
            elif self.kind == 'anti-similarity':
                self.ind[col] = agg(1 - row)

    def fit(self, df):
        """Score every feature of df: fscore = discernibility * independence."""
        self.df = df.copy()
        self.discernibility()
        self.cosineSimilarity()
        self.independence()
        self.fscore = self.dis.mul(self.ind)
        self.fitted = True
class SCFS_trim():
    """Percentile-based feature trimmer on top of a fitted SCFS scorer.

    Reference article for the feature scoring:
    https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
    Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant
    """
    def __init__(self, thresh, fs):
        """thresh: trimming percentile in the range 0-100; fs: fitted SCFS instance."""
        self.thresh = thresh
        self.fs = fs

    def fit(self, df):
        # nothing to learn here: the scoring lives in the SCFS instance
        pass

    def transform(self, df):
        """Return only the columns whose score is at or above the threshold percentile."""
        if not self.fs.fitted:
            raise ValueError("Fit SCFS first")
        self.df = df.copy()
        cutoff = np.percentile(self.fs.fscore, self.thresh)
        keep = self.fs.fscore.loc[self.fs.fscore >= cutoff].index
        return self.df[keep]
# delete any previous instance of the global scoreLog so notebook reruns start fresh
try:
    del scoreLog
    print("scoreLog deleted")
except NameError:  # narrowed from a bare except: only "name not defined" is expected here
    print("scoreLog undefined")
# defining a function to report classification metrics
def reporter(Y_train, pred_train, Y_test, pred_test, model_name):
    """Classification report.

    Logs test scores to a global dataframe named scoreLog; the scoreLog
    (with any previous scores) is displayed, along with side-by-side
    confusion-matrix heatmaps for the current arguments.
    ---------------------------------------------------------------------------
    Y_train ==> TRUE classes used for training (1-D pandas Series / numpy array)
    pred_train ==> PREDICTION on training data (1-D pandas Series / numpy array)
    Y_test ==> TRUE classes used for testing (1-D pandas Series / numpy array)
    pred_test ==> PREDICTION on test data (1-D pandas Series / numpy array)
    model_name ==> str name for the current model, used as index in scoreLog
    ---------------------------------------------------------------------------
    """
    from sklearn import metrics
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    global scoreLog

    classes = list(Y_test.unique())
    cols = ["accuracy"]
    cols.extend(["precision_" + str(c) for c in classes])
    cols.extend(["recall_" + str(c) for c in classes])
    cols.extend(["fscore_" + str(c) for c in classes])
    try:
        scoreLog
    except NameError:  # first call in this session: start a fresh log
        scoreLog = pd.DataFrame(columns=cols)

    def heatfig(y_true, y_pred, axis_title, trace_name):
        """Annotated confusion-matrix heatmap for one data split."""
        z = np.array(pd.DataFrame(metrics.confusion_matrix(y_true, y_pred)))
        labels = list(np.sort(np.unique(y_true)))
        f = ff.create_annotated_heatmap(z, annotation_text=z,
                                        x=labels, y=labels,
                                        colorscale='Mint', font_colors=['grey', 'white'], name=trace_name,
                                        hovertemplate="Prediction: %{x:d}<br>True: %{y:d}<br>Count: %{z:d}")
        f.update_layout(height=350, width=350)
        f.update_xaxes(title_text=axis_title)
        f.update_yaxes(title_text="TRUE", tickangle=270)
        return f

    def scores(y_true, y_pred):
        """accuracy followed by per-class precision, recall and f1."""
        s = [metrics.accuracy_score(y_true, y_pred)]
        s.extend(metrics.precision_score(y_true, y_pred, labels=classes, average=None))
        s.extend(metrics.recall_score(y_true, y_pred, labels=classes, average=None))
        s.extend(metrics.f1_score(y_true, y_pred, labels=classes, average=None))
        return s

    fig1 = heatfig(Y_train, pred_train, "PREDICTED (TRAINING SET) - " + model_name, "TRAINING SET")
    fig2 = heatfig(Y_test, pred_test, "PREDICTED (TEST SET) - " + model_name, "TEST SET")

    # log scores; DataFrame.append was removed in pandas 2.0 -> use pd.concat
    scoreLog = pd.concat([
        scoreLog,
        pd.DataFrame(scores(Y_train, pred_train), index=cols, columns=[model_name + "_training"]).T,
        pd.DataFrame(scores(Y_test, pred_test), index=cols, columns=[model_name + "_test"]).T,
    ])

    # merge both confusion matrix heatplots side by side
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.05)
    fig.add_trace(fig1.data[0], row=1, col=1)
    fig.add_trace(fig2.data[0], row=1, col=2)
    annot1 = list(fig1.layout.annotations)
    annot2 = list(fig2.layout.annotations)
    for ann in annot2:  # re-point the second heatmap's annotations at its own axes
        ann['xref'] = 'x2'
        ann['yref'] = 'y2'
    fig.update_layout(annotations=annot1 + annot2)
    fig.layout.xaxis.update(fig1.layout.xaxis)
    fig.layout.yaxis.update(fig1.layout.yaxis)
    fig.layout.xaxis2.update(fig2.layout.xaxis)
    fig.layout.yaxis2.update(fig2.layout.yaxis)
    fig.layout.yaxis2.update({'title': {'text': ''}})
    display(scoreLog)
    fig.show()
scoreLog undefined
def dtc_pipe(X_train, X_test, Y_train, Y_test, mname):
    """Basic model+predict+log cycle: standardise, SMOTE-balance, fit a
    shallow DecisionTree, then log scores via the shared reporter()."""
    # standardize: fit the scaler on the training split only
    scaler = StandardScaler()
    train_std = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    test_std = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)  # transform only
    # balance the training data set by oversampling the minority classes
    balancer = SMOTE(sampling_strategy='not majority', random_state=129)
    X_bal, Y_bal = balancer.fit_resample(train_std, Y_train)
    # model learning on the balanced data
    model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
    model.fit(X_bal, Y_bal)
    # predict on the original (imbalanced) standardised splits
    pred_tr = model.predict(train_std)
    pred_te = model.predict(test_std)
    # record scores
    reporter(Y_train, pred_tr, Y_test, pred_te, mname)
    # generate reports (custom-built function: code in the beginning of notebook)
class scfs_trim_dtc():
    """Grid-searchable wrapper classifier (SCFS_trim + scaler + DecisionTree)
    used to tune the SCFS_trim percentile threshold."""
    def __init__(self, thresh, fs):
        """thresh: percentile for SCFS_trim; fs: a fitted SCFS instance."""
        self.thresh = thresh
        self.fs = fs
        self.fitted = False

    def fit(self, X_train, Y_train):
        """Trim features by SCFS score, standardise, then fit the tree."""
        # SCFS trimming
        self.fst = SCFS_trim(self.thresh, self.fs)
        X_trim = self.fst.transform(X_train)
        # standardize (fit on training data only)
        self.scl = StandardScaler()
        X_train_std = pd.DataFrame(self.scl.fit_transform(X_trim), columns=X_trim.columns, index=X_trim.index)
        # model learning
        self.dtc = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
        self.dtc.fit(X_train_std, Y_train)
        self.fitted = True

    def predict(self, X_test):
        """Apply the fitted trim + scaler, then predict with the tree."""
        if not self.fitted:
            # fixed the typo'd error message ("Fit scfs_tri_dtc firstly")
            raise ValueError("Fit scfs_trim_dtc first")
        X_test_trim = self.fst.transform(X_test)
        X_test_std = pd.DataFrame(self.scl.transform(X_test_trim),
                                  columns=X_test_trim.columns, index=X_test_trim.index)
        return self.dtc.predict(X_test_std)
def cvSplitter(X, Y, k=10, seed=100):
    """Split X/Y into k cross-validation folds and return copied frames.

    Parameters
    ----------
    X, Y : pandas DataFrame / Series sharing the same (unique) index.
    k : int, number of folds (default 10).
    seed : int, seed for the one-off shuffle (default 100).
        NOTE: seeds numpy's *global* RNG, as the original did.

    Returns
    -------
    (Xtrains, Ytrains, Xvals, Yvals) : four lists of length k; fold i's
    training frames contain every shuffled index not in validation fold i.
    """
    X = X.copy()
    Y = Y.copy()
    n_rows = X.shape[0]
    # shuffle the index once; folds are contiguous slices of this order
    np.random.seed(seed)
    indices = np.random.choice(X.index, n_rows, False)
    Xtrains, Xvals, Ytrains, Yvals = [], [], [], []
    start = 0
    for i in range(k):
        stop = int(np.floor(n_rows * (i + 1) / k))
        val_idx = list(indices[start:stop])
        # set lookup replaces the original O(n^2) per-fold membership scan
        val_set = set(val_idx)
        train_idx = [j for j in indices if j not in val_set]
        Xvals.append(X.loc[val_idx].copy())
        Yvals.append(Y.loc[val_idx].copy())
        Xtrains.append(X.loc[train_idx].copy())
        Ytrains.append(Y.loc[train_idx].copy())
        start = stop
    return Xtrains, Ytrains, Xvals, Yvals
class remap():
    """Per-column skew correction followed by z-score standardisation.

    fit() inspects every column, decides whether a reflection (for strong
    negative skew), a 0-500 min-max rescale, and a log or sqrt transform
    reduce the absolute skewness, records those decisions in
    `fitting_info`, and fits a StandardScaler on the transformed training
    data.  transform() replays the recorded decisions on new data and
    applies the fitted scaler.
    """
    def __init__(self):
        """performs skew correction and z-score standardisation"""
        # NOTE(review): this local import is unused — fit() resolves
        # StandardScaler from the module-level import instead
        from sklearn.preprocessing import StandardScaler
        self.fitted=False
    def fit(self,df):
        """Register per-column stats and transform decisions, then fit the scaler."""
        df=df.copy()
        # one row per column; the flag columns record which transforms apply
        self.fitting_info=pd.DataFrame(columns=["skew","kurt","min","max","reflect","r_min","r_max","mms","log","sqrt"],
                                       index=df.columns)
        # initialise flags
        self.fitting_info["reflect"] = False
        self.fitting_info["mms"] = False
        self.fitting_info["log"] = False
        self.fitting_info["sqrt"] = False
        # record basic stats
        self.fitting_info["skew"] = df.skew()
        self.fitting_info["kurt"] = df.kurt()
        self.fitting_info["min"] = df.min()
        self.fitting_info["max"] = df.max()
        # test need for reflected transforms (columns with strong NEGATIVE skew)
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]<=-0.75].index)
        for col in collist:
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            # reflect so the long tail points right (log/sqrt reduce right skew)
            temp_r = cmax+1-df[col]
            cmin=temp_r.min()
            cmax=temp_r.max()
            self.fitting_info.loc[col,["r_min","r_max"]]=[cmin,cmax]
            # scale between 0-500
            temp_r_mms = (temp_r-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_r_mms.min(),temp_r_mms.max()]
            # scaled log transform (+1 guards log(0))
            temp_r_mms_l = (temp_r_mms+1).apply(np.log)
            # scaled sqrt transform
            temp_r_mms_s = temp_r_mms.apply(np.sqrt)
            # plain log transform
            temp_r_l = (temp_r+1).apply(np.log)
            # plain sqrt transform
            temp_r_s = temp_r.apply(np.sqrt)
            # absolute skews of the four candidate transforms
            t_skew = np.abs([temp_r_l.skew(),temp_r_s.skew(),temp_r_mms_l.skew(),temp_r_mms_s.skew()])
            # register flags: keep the best candidate only if it beats the raw skew
            if round(min(t_skew),2)<round(abs(cskew),2):
                self.fitting_info.loc[col,"reflect"]=True
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_r_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_r_s
                elif min(t_skew)==t_skew[2]:
                    self.fitting_info.loc[col,["log","mms"]]=[True,True]
                    df[col]=temp_r_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_r_mms_s
        # test need for plain transforms (columns with strong POSITIVE skew)
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]>=0.75].index)
        for col in collist:
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            # scale between 0-500
            temp_mms = (df[col]-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_mms.min(),temp_mms.max()]
            # scaled log transform
            temp_mms_l = (temp_mms+1).apply(np.log)
            # scaled sqrt transform
            temp_mms_s = temp_mms.apply(np.sqrt)
            # plain log transform
            temp_l = (df[col]+1).apply(np.log)
            # plain sqrt transform
            temp_s = df[col].apply(np.sqrt)
            # absolute skews of the four candidate transforms
            t_skew = np.abs([temp_l.skew(),temp_s.skew(),temp_mms_l.skew(),temp_mms_s.skew()])
            # register flags: keep the best candidate only if it beats the raw skew
            if round(min(t_skew),2)<round(abs(cskew),2):
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_s
                elif min(t_skew)==t_skew[2]:
                    # NOTE(review): scalar True broadcasts to both cells — same
                    # effect as the [True,True] form used in the other branches
                    self.fitting_info.loc[col,["log","mms"]]=True
                    df[col]=temp_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_mms_s
        # z-score fitting on the transformed training data
        self.scl=StandardScaler()
        self.scl.fit(df)
        # set fitted flag
        self.fitted=True
    def transform(self,df):
        """Replay the recorded transforms on df and apply the fitted scaler."""
        if not self.fitted:
            raise ValueError("please fit remap")
            return  # NOTE(review): unreachable — the raise above always fires
        df=df.copy()
        for col in df.columns:
            # find min max value (recorded from the training data)
            cmin = self.fitting_info.loc[col,"min"]
            cmax = self.fitting_info.loc[col,"max"]
            # 1. reflection
            if self.fitting_info.loc[col,"reflect"]:
                temp = cmax+1-df[col]
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"r_min"]
                cmax = self.fitting_info.loc[col,"r_max"]
            # 2. min max scaling for log / sqrt
            if self.fitting_info.loc[col,"mms"]:
                temp = (df[col]-cmin)*500/(cmax-cmin)
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"mms_min"]
                cmax = self.fitting_info.loc[col,"mms_max"]
            # 3. shift data to +ve scale (new data may fall below the training min)
            if cmin<0:
                df[col]=df[col]-cmin
            if df[col].min()<0: # reconfirm
                df[col]=df[col]-df[col].min()
            # 4. log transform
            if self.fitting_info.loc[col,"log"]:
                df[col]=(df[col]+1).apply(np.log)
            # 5. sqrt transform
            if self.fitting_info.loc[col,"sqrt"]:
                df[col]=df[col].apply(np.sqrt)
            # 6. reverse Reflection
            # NOTE(review): uses np.log(cmax)+1 even when the sqrt branch was
            # chosen — looks like it assumes a log transform; confirm the
            # intent for reflected+sqrt columns
            if self.fitting_info.loc[col,"reflect"]:
                temp = np.log(cmax)+1-df[col]
                df[col] = temp
            # find skew after the column transforms
            self.fitting_info.loc[col,"trans_skew"]=df[col].skew()
        # 7. z-score transform with the scaler fitted in fit()
        df=pd.DataFrame(self.scl.transform(df),columns=df.columns,index=df.index)
        # find scaled skew
        self.fitting_info["trans_scaled_skew"]=df.skew()
        return df
    def fit_transform(self,df):
        """fit, remap"""
        self.fit(df)
        df=self.transform(df)
        return df
class pandaPoly():
    """PolynomialFeatures extraction that returns a pandas DataFrame
    (original columns merged with the generated polynomial columns)."""
    from sklearn.preprocessing import PolynomialFeatures
    def __init__(self, degree=2, interaction_only=True):
        """Bug fix: the constructor arguments were silently ignored
        (degree=2 and interaction_only=True were hard-coded); they are
        now passed through, with the same defaults as before."""
        self.poly = self.PolynomialFeatures(degree=degree, interaction_only=interaction_only)
        self.fitted = False
    def fit(self, df):
        """Learn the polynomial feature expansion from df."""
        self.poly.fit(df)
        self.fitted = True
    def transform(self, df):
        """Return df merged (on index) with its polynomial feature columns."""
        if not self.fitted:
            raise ValueError("please fit pandaPoly")
        df = df.copy()
        d2 = pd.DataFrame(self.poly.transform(df), index=df.index)
        d2 = pd.merge(df, d2, left_index=True, right_index=True)
        return d2
    def fit_transform(self, df):
        """Convenience: fit then transform."""
        self.fit(df)
        return self.transform(df)
class dummies:
    """One-hot encoding of object columns without train/test data leak.

    fit() memorises the categories seen in the training frame; transform()
    encodes any frame using exactly those categories, so categories unseen
    at fit time simply yield all-zero dummy columns.
    """
    def __init__(self):
        # ref maps column name -> list of categories (most frequent first)
        self.ref = {}
        self.fitted = False
    def fit(self, df):
        """Collect the categories of every object-dtype column."""
        for col in df.select_dtypes(include='object').columns:
            self.ref[col] = list(df[col].value_counts().index)
        self.fitted = True
        return
    def transform(self, df):
        """Encode the registered columns, dropping the first (most frequent)
        dummy of each column — mirroring get_dummies(drop_first=True).

        Bug fixes vs the original:
          * rows whose original value was the literal 1 were left as 1 in
            every dummy column (the `!= 1` reset skipped them);
          * the drop-first line indexed the category list by a category
            VALUE (`unq[i]`), crashing for non-integer categories.
        """
        if not self.fitted:
            raise ValueError("please fit first")
        df = df.copy()
        for col, cats in self.ref.items():
            for cat in cats[1:]:  # skip the first category == drop_first
                df[col + "_" + str(cat)] = (df[col] == cat).astype('uint8')
            df.drop(col, axis=1, inplace=True)
        # downcast numeric columns to float32 (as the original did)
        df = df.apply(pd.to_numeric, errors='ignore', downcast='float', axis=0)
        return df
    def fit_transform(self, df):
        """Learn and encode in one call."""
        self.fit(df)
        return self.transform(df)
class pandaCluster():
    """KMeans clustering helper: appends one-hot cluster-membership columns
    to a DataFrame and returns the merged pandas DataFrame."""
    def __init__(self, n_clusters=4):  # 4 selected for simplicity
        self.fitted = False
        # sub-models: skew-correcting scaler, dummy encoder, clusterer
        self.scl = remap()
        self.dum = dummies()
        self.clt = KMeans(n_clusters=n_clusters)
    def fit(self, df):
        """Fit the scaler, the clusterer and the cluster-label encoder on df."""
        work = df.copy()
        # scale incoming data
        work = self.scl.fit_transform(work)
        # cluster fitting
        self.clt.fit(work)
        # encoder fitting for the predicted cluster labels
        labels = pd.DataFrame(self.clt.predict(work), columns=["CLUSTER"],
                              index=work.index, dtype='object')
        self.dum.fit(labels)
        self.fitted = True
    def transform(self, df):
        """Return df with encoded cluster-membership columns appended."""
        if not self.fitted:
            raise ValueError("please fit pandaCluster")
        original = df.copy()
        # scale, predict clusters, encode the labels
        scaled = self.scl.transform(df.copy())
        labels = pd.DataFrame(self.clt.predict(scaled), columns=["CLUSTER"],
                              index=scaled.index, dtype='object')
        encoded = self.dum.transform(labels)
        # merge the encoded clusters back onto the unscaled source
        return pd.merge(original, encoded, left_index=True, right_index=True)
    def fit_transform(self, df):
        """Fit then transform."""
        self.fit(df)
        return self.transform(df)
DOMAIN: Semiconductor manufacturing process
• CONTEXT:
A complex modern semiconductor manufacturing process is normally under constant surveillance via the monitoring of signals/variables collected from sensors and or process measurement points. However, not all of these signals are equally valuable in a specific monitoring system. The measured signals contain a combination of useful information, irrelevant information as well as noise. Engineers typically have a much larger number of signals than are actually required. If we consider each type of signal as a feature, then feature selection may be applied to identify the most relevant signals. The Process Engineers may then use these signals to determine key factors contributing to yield excursions downstream in the process. This will enable an increase in process throughput, decreased time to learning and reduce the per unit production costs. These signals can be used as features to predict the yield type. And by analysing and trying out different combinations of features, essential signals that are impacting the yield type can be identified.
• DATA DESCRIPTION: sensor-data.csv : (1567, 592)
The data consists of 1567 datapoints, each with 591 features. Each example represents a single production entity: the measured features are the associated sensor signals, and the label is a simple pass/fail yield from in-house line testing. In the target column, "-1" corresponds to a pass and "1" corresponds to a fail, and the timestamp identifies the specific test point.
• PROJECT OBJECTIVE:
We will build a classifier to predict the Pass/Fail yield of a particular process entity and analyse whether all the
features are required to build the model or not.
Steps and tasks:
1.Import and understand the data.
A. Import ‘signal-data.csv’ as DataFrame.
B. Print 5 point summary and share at least 2 observations.
# read the dataset
# NOTE(review): the brief says 'signal-data.csv' while the data description
# above calls it sensor-data.csv — confirm the file name matches what is on disk
df=pd.read_csv("signal-data.csv")
df.head()
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ... | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 0.0 | 7.9558 | 414.8710 | 10.0433 | 0.9680 | 192.3963 | 12.5190 | 1.4026 | -5419.00 | 2916.50 | -4043.75 | ... | NaN | NaN | NaN | NaN | 533.8500 | 2.1113 | 8.95 | 0.3157 | 3.0624 | 0.1026 | 1.6765 | 14.9509 | NaN | NaN | NaN | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 0.0 | 10.1548 | 414.7347 | 9.2599 | 0.9701 | 191.2872 | 12.4608 | 1.3825 | -5441.50 | 2604.25 | -3498.75 | ... | NaN | NaN | NaN | NaN | 535.0164 | 2.4335 | 5.92 | 0.2653 | 2.0111 | 0.0772 | 1.1065 | 10.9003 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 0.0 | 9.5157 | 416.7075 | 9.3144 | 0.9674 | 192.7035 | 12.5404 | 1.4123 | -5447.75 | 2701.75 | -4047.00 | ... | 0.4122 | 0.2562 | 0.4119 | 68.8489 | 535.0245 | 2.0293 | 11.21 | 0.1882 | 4.0923 | 0.0640 | 2.0952 | 9.2721 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 0.0 | 9.6052 | 422.2894 | 9.6924 | 0.9687 | 192.1557 | 12.4782 | 1.4011 | -5468.25 | 2648.25 | -4515.00 | ... | 3.5611 | 0.0670 | 2.7290 | 25.0363 | 530.5682 | 2.0253 | 9.33 | 0.1738 | 2.8971 | 0.0525 | 1.7585 | 8.5831 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 0.0 | 10.5661 | 420.5925 | 10.3387 | 0.9735 | 191.6037 | 12.4735 | 1.3888 | -5476.25 | 2635.25 | -3987.50 | ... | NaN | NaN | NaN | NaN | 532.0155 | 2.0275 | 8.83 | 0.2224 | 3.1776 | 0.0706 | 1.6597 | 10.9698 | NaN | NaN | NaN | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 592 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
df.select_dtypes(include='int64').columns
Index(['Pass/Fail'], dtype='object')
df.select_dtypes(include='object').columns
Index(['Time'], dtype='object')
df.select_dtypes(include='object').describe()
| Time | |
|---|---|
| count | 1567 |
| unique | 1534 |
| top | 2008-10-15 01:52:00 |
| freq | 3 |
Every column is numeric except for the Time column.
Later, let's see whether we can extract features from the Time column; otherwise we will drop it.
The Time column also seems to have duplicates, which could be the case for all the other columns too;
we need to confirm before dropping those.
# typecast to datetime
df.Time=pd.to_datetime(df.Time)
# print 5 point summary (last 5 rows of describe: 25%/50%/75%/max/std)
# NOTE(review): datetime_is_numeric was removed in pandas 2.0 — this call
# raises TypeError there; newer pandas treats datetime columns as numeric by default
df.describe(datetime_is_numeric=True)[-5:]
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ... | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25% | 2008-07-09 15:32:00 | 2966.260000 | 2452.247500 | 2181.044400 | 1081.87580 | 1.01770 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | -0.005600 | 0.958100 | 198.130700 | 0.0 | 7.094875 | 406.127400 | 9.567625 | 0.968200 | 188.299825 | 12.460000 | 1.396500 | -5933.250000 | 2578.000000 | -4371.750000 | ... | 2.090200 | 0.038200 | 1.884400 | 15.466200 | 530.702700 | 1.982900 | 7.500000 | 0.242250 | 2.56785 | 0.075100 | 1.408450 | 11.501550 | 0.01380 | 0.01060 | 0.003400 | 46.184900 | 0.497900 | 0.01160 | 0.00310 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.00000 |
| 50% | 2008-08-23 13:02:00 | 3011.490000 | 2499.405000 | 2201.066700 | 1285.21440 | 1.31680 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | 0.000400 | 0.965800 | 199.535600 | 0.0 | 8.967000 | 412.219100 | 9.851750 | 0.972600 | 189.664200 | 12.499600 | 1.406000 | -5523.250000 | 2664.000000 | -3820.750000 | ... | 2.150450 | 0.048650 | 1.999700 | 16.988350 | 532.398200 | 2.118600 | 8.650000 | 0.293400 | 2.97580 | 0.089500 | 1.624500 | 13.817900 | 0.02040 | 0.01480 | 0.004700 | 72.288900 | 0.500200 | 0.01380 | 0.00360 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.00000 |
| 75% | 2008-09-22 11:16:30 | 3056.650000 | 2538.822500 | 2218.055500 | 1591.22350 | 1.52570 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | 0.005900 | 0.971300 | 202.007100 | 0.0 | 10.861875 | 419.089275 | 10.128175 | 0.976800 | 192.189375 | 12.547100 | 1.415000 | -5356.250000 | 2841.750000 | -3352.750000 | ... | 3.098725 | 0.075275 | 2.970850 | 24.772175 | 534.356400 | 2.290650 | 10.130000 | 0.366900 | 3.49250 | 0.112150 | 1.902000 | 17.080900 | 0.02770 | 0.02000 | 0.006475 | 116.539150 | 0.502375 | 0.01650 | 0.00410 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.00000 |
| max | 2008-12-10 18:47:00 | 3356.350000 | 2846.440000 | 2315.266700 | 3715.04170 | 1114.53660 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | 0.053000 | 0.984800 | 272.045100 | 0.0 | 19.546500 | 824.927100 | 102.867700 | 0.984800 | 215.597700 | 12.989800 | 1.453400 | 0.000000 | 3656.250000 | 2363.000000 | ... | 14.014100 | 0.293200 | 12.746200 | 84.802400 | 589.508200 | 2.739500 | 454.560000 | 2.196700 | 170.02040 | 0.550200 | 90.423500 | 96.960100 | 0.10280 | 0.07990 | 0.028600 | 737.304800 | 0.509800 | 0.47660 | 0.10450 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.00000 |
| std | NaN | 73.621787 | 80.407705 | 29.513152 | 441.69164 | 56.35554 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | 0.009302 | 0.012452 | 3.257276 | 0.0 | 2.796596 | 17.221095 | 2.403867 | 0.012062 | 2.781041 | 0.217965 | 0.016737 | 626.822178 | 295.498535 | 1380.162148 | ... | 1.032761 | 0.032761 | 0.996644 | 10.213294 | 17.499736 | 0.275112 | 86.304681 | 0.248478 | 26.92015 | 0.067791 | 16.921369 | 12.485267 | 0.01173 | 0.00964 | 0.003116 | 87.520966 | 0.003404 | 0.01718 | 0.00372 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.49801 |
5 rows × 592 columns
There are a few constant columns, e.g. "13", "42", ...
There are a few extremely skewed or quasi-constant columns, e.g. "4", "21", ...
There are a few near-perfect bell curves, e.g. "24".
We need to review and remove columns that don't add information about the target. With respect to the target, the dataset seems imbalanced, as more than 75% of the data corresponds to -1.
2.Data cleansing:
A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.
B. Identify and drop the features which are having same value for all the rows.
# verify if target column has nans
df["Pass/Fail"].isna().sum()
0
safe to continue without dropping any records
%%time
#lets review the nulls
nulsCount(df)
#(custom-built function : code in the begining of notebook)
CPU times: user 284 ms, sys: 20.1 ms, total: 305 ms Wall time: 303 ms
| NULL | NULL % | NAN | NAN % | BLANKS | BLANKS % | UNEXP | UNEXP % | |
|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 0.38 | 6 | 0.38 | 0 | 0.0 | 0 | 0.0 |
| 1 | 7 | 0.45 | 7 | 0.45 | 0 | 0.0 | 0 | 0.0 |
| 2 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| 3 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| 4 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 585 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 586 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 587 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 588 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 589 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
538 rows × 8 columns
%%time
# lets review least number of uniques in the features
df.nunique().sort_values()[:5]
CPU times: user 99.2 ms, sys: 0 ns, total: 99.2 ms Wall time: 108 ms
262 1 263 1 264 1 265 1 266 1 dtype: int64
# benchmark shape
df.shape
(1567, 592)
df_raw=df.copy()
%%time
# single cleansing pass over all columns (df.columns is evaluated once at
# loop start, so dropping columns inside the loop is safe)
for col in df.columns:
    if df[col].nunique()==1: # features having same values for all rows
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()/df.shape[0]>0.2: # features with 20%+ Null values
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()>0: # features having at least 1 null: impute column mean
        # assignment instead of chained `df[col].fillna(..., inplace=True)`:
        # the chained-inplace form is deprecated and silently stops updating
        # df under pandas copy-on-write
        df[col] = df[col].fillna(df[col].mean().astype('float32'))
CPU times: user 520 ms, sys: 11.7 ms, total: 532 ms Wall time: 530 ms
# review shape
df.shape
(1567, 444)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 444 entries, Time to Pass/Fail dtypes: datetime64[ns](1), float64(442), int64(1) memory usage: 5.3 MB
# review nulls
nulsCount(df)
none found, hence lets proceed
----------------------------------------------------------------------------
Let us set a base line model using DecisionTreeClassifier
# separate predictors & targets: drop Time (first column); Pass/Fail (last column) is the target
X = df[df.columns[1:-1]]
Y = df[df.columns[-1]]
# Train & Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed
# model learning: shallow tree as an intentionally simple baseline
dtc=DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=1)
dtc.fit(X_train,Y_train)
# predict
pred_train=dtc.predict(X_train)
pred_test=dtc.predict(X_test)
# record scores
reporter(Y_train,pred_train,Y_test,pred_test,"DTC_raw")
# generate reports (custom-built function : code in the beginning of notebook)
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
Pretty impressive accuracy and low execution time,
but unfortunately the precision, recall and F1-score for the FAIL class (+1) are very poor.
They are poor on the training-data predictions, probably because of the imbalanced data;
on the test-data predictions they fall even lower, indicating an over-fitted model.
Let's build on our modelling:
before proceeding further, let's extract some timestamp features, a few polynomial features and the inherent clusters.
# benchmark
X_train.shape
(1253, 442)
# extract calendar/time features from the Time column
# NOTE(review): X is a multi-column selection of df (a copy), so these
# assignments populate X but may emit SettingWithCopyWarning — confirm the
# behaviour under pandas copy-on-write
X["Year"]=df.Time.dt.year
X["Mon"]=df.Time.dt.month
X["day"]=df.Time.dt.day
X["day_of_week"]=df.Time.dt.day_of_week
X["day_of_year"]=df.Time.dt.day_of_year
X["weekofyear"]=df.Time.dt.isocalendar().week  # NOTE: returns a UInt32 extension dtype
X["Hour"]=df.Time.dt.hour
X["Min"]=df.Time.dt.minute
X["Sec"]=df.Time.dt.second
X["Qtr"]=df.Time.dt.quarter
# Train & Test Split — re-split after adding the time features (same seed as
# before, so the same row partition is reproduced)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed
# review
X_train.shape
(1253, 452)
%%time
# add features encoding the inherent clusters in the dataset (custom pandaCluster above)
clt=pandaCluster()
X_train_clt=clt.fit_transform(X_train)  # fit scaler/KMeans/encoder on training data only
X_test_clt=clt.transform(X_test)  # transform only: reuse the fitted models (no leakage)
CPU times: user 5.5 s, sys: 155 ms, total: 5.65 s Wall time: 3.36 s
# review
X_train.shape
(1253, 452)
%%time
#lets add some polynomial features to the data
poly=pandaPoly()
X_train_poly=poly.fit_transform(X_train_clt)
X_test_poly=poly.transform(X_test_clt)
CPU times: user 667 ms, sys: 265 ms, total: 932 ms Wall time: 933 ms
# review: shape of the base frame (the polynomial copy lives in X_train_poly)
X_train.shape
(1253, 452)
# review model performance with the date/cluster/polynomial features
# dtc_pipe is a custom pipe : code in the beginning of notebook
dtc_pipe(X_train_poly, X_test_poly, Y_train, Y_test,"DTC2_date_clt_poly")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_date_clt_poly_training | 0.786113 | 0.982924 | 0.202532 | 0.785166 | 0.800000 | 0.872986 | 0.323232 |
| DTC2_date_clt_poly_test | 0.757962 | 0.928000 | 0.093750 | 0.800000 | 0.250000 | 0.859259 | 0.136364 |
2.Data cleansing:
C. Drop other features if required using relevant functional knowledge. Clearly justify the same.
# review the spread (standard deviation) of every engineered feature
stddev = pd.DataFrame(X_train_poly.std(), columns=["stddev"])
gdata = list(stddev.stddev)
# distribution plot of the per-feature standard deviations
fig = ff.create_distplot([gdata], ['Standard Deviations'],  # fixed label typo ("Stdandard")
                         curve_type='kde', show_hist=True,
                         show_rug=True)
fig.update_layout(height=500, width=1000, showlegend=False)
fig.show()
# percentile summary of the spread across features
stddev.describe(percentiles=[0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9]).T
| count | mean | std | min | 10% | 20% | 25% | 30% | 40% | 50% | 60% | 70% | 75% | 80% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| stddev | 104196.0 | 22094.230647 | 525219.45057 | 0.0 | 0.004872 | 0.051158 | 0.123477 | 0.266408 | 1.128784 | 4.412979 | 17.15629 | 68.845767 | 151.635949 | 345.936963 | 2815.906138 | 7.287779e+07 |
The variances (or standard deviations) of several features are condensed below unity.
This indicates that several features would not contribute to the model learning.
Though a z-score transformation would shift and rescale the distributions, it would also amplify the influence of noise on the model learning.
Hence let us use a few feature-selection techniques to shrink our dataset.
# remove quasi-constant features: keep only columns whose variance exceeds 2%
quasi = VarianceThreshold(threshold=0.02)
quasi.fit(X_train_poly)
kept_cols = X_train_poly.columns[quasi.get_support()]
# rebuild DataFrames so column names and row index survive the numpy round-trip
X_train_quasi = pd.DataFrame(quasi.transform(X_train_poly),
                             columns=kept_cols, index=X_train_poly.index)
X_test_quasi = pd.DataFrame(quasi.transform(X_test_poly),
                            columns=kept_cols, index=X_test_poly.index)
# review model performance after quasi-constant removal
# dtc_pipe is a custom pipe : code in the beginning of notebook
dtc_pipe(X_train_quasi, X_test_quasi, Y_train, Y_test,"DTC3_quasi")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_date_clt_poly_training | 0.786113 | 0.982924 | 0.202532 | 0.785166 | 0.800000 | 0.872986 | 0.323232 |
| DTC2_date_clt_poly_test | 0.757962 | 0.928000 | 0.093750 | 0.800000 | 0.250000 | 0.859259 | 0.136364 |
| DTC3_quasi_training | 0.802075 | 0.976313 | 0.202128 | 0.808184 | 0.712500 | 0.884328 | 0.314917 |
| DTC3_quasi_test | 0.713376 | 0.934783 | 0.107143 | 0.741379 | 0.375000 | 0.826923 | 0.166667 |
The f1 score and recall for the FAIL class have improved.
Let's study further.
using custom method based on published paper from
SCFS (Standard deviation and Cosine similarity based Feature Selection)
Reference article for feature scoring
https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant
Explanation & Justification to use the method
The discernibility of a feature, refers to its distinguishable capability between categories
Feature selection aims to detect the features whose distinguishable capability is strong while the redundancy between them is less
To represent the redundancy between a feature and the other features, cosine similarity is used
Feature independence is deduced from cosine similarity ( in 3 possible ways)
The method guarantees that a feature will have the maximal independence as far as possible once it has the maximal discernibility
%%time
# score features with SCFS (custom class: code at the beginning of the notebook)
# NOTE(review): the recorded run of this cell died with a MemoryError while
# building a features x features cosine-similarity frame (see traceback below),
# so X_trim1 / X_test_trim1 were never created in that session
scfs=SCFS(kind='exp')
# evaluate feature scores on the quasi-filtered training data
scfs.fit(X_train_quasi)
# trim the datasets based on score & threshold (75%)
# SCFS_trim is a separate custom class: code at the beginning of the notebook
# the threshold is kept separate from scoring for faster grid search later
fs_trim=SCFS_trim(75,scfs)
X_trim1=fs_trim.transform(X_train_quasi)
# drop the corresponding test features also
X_test_trim1=fs_trim.transform(X_test_quasi)
# review trimmed data shape
X_trim1.shape
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <timed exec> in <module> <ipython-input-4-117ed22b0394> in fit(self, df) 52 53 self.discernibility() ---> 54 self.cosineSimilarity() 55 self.independence() 56 <ipython-input-4-117ed22b0394> in cosineSimilarity(self) 19 def cosineSimilarity(self): 20 """populate the cosine similarities (absolute)""" ---> 21 self.cosdf=pd.DataFrame(columns=self.df.columns,index=self.df.columns) 22 for i in self.df.columns: 23 for j in self.df.columns: ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy) 527 528 elif isinstance(data, dict): --> 529 mgr = init_dict(data, index, columns, dtype=dtype) 530 elif isinstance(data, ma.MaskedArray): 531 import numpy.ma.mrecords as mrecords ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype) 271 nan_dtype = dtype 272 val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) --> 273 arrays.loc[missing] = [val] * missing.sum() 274 275 else: ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/indexing.py in __setitem__(self, key, value) 690 691 iloc = self if self.name == "iloc" else self.obj.iloc --> 692 iloc._setitem_with_indexer(indexer, value, self.name) 693 694 def _validate_key(self, key, axis: int): ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value, name) 1635 self._setitem_with_indexer_split_path(indexer, value, name) 1636 else: -> 1637 self._setitem_single_block(indexer, value, name) 1638 1639 def _setitem_with_indexer_split_path(self, indexer, value, name: str): ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/indexing.py in _setitem_single_block(self, indexer, value, 
name) 1859 # actually do the set 1860 self.obj._consolidate_inplace() -> 1861 self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) 1862 self.obj._maybe_update_cacher(clear=True) 1863 ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/internals/managers.py in setitem(self, indexer, value) 566 567 def setitem(self, indexer, value) -> "BlockManager": --> 568 return self.apply("setitem", indexer=indexer, value=value) 569 570 def putmask(self, mask, new, align: bool = True, axis: int = 0): ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 425 applied = b.apply(f, **kwargs) 426 else: --> 427 applied = getattr(b, f)(**kwargs) 428 except (TypeError, NotImplementedError): 429 if not ignore_failures: ~/anaconda3/envs/data-science-stack-2.9.0/lib/python3.7/site-packages/pandas/core/internals/blocks.py in setitem(self, indexer, value) 1000 else: 1001 is_ea_value = False -> 1002 arr_value = np.array(value) 1003 1004 if transpose: MemoryError: Unable to allocate 44.5 GiB for an array with shape (77275, 77275) and data type object
# review the spread of the float features retained by SCFS
# NOTE(review): the recorded run raised NameError here because the SCFS fit
# above failed with a MemoryError, so X_trim1 was never created — confirm
d1 = X_trim1.select_dtypes(include='float')
stddev = pd.DataFrame(d1.std(), columns=["stddev"])
gdata = list(stddev.stddev)
# distribution plot of the per-feature standard deviations
fig = ff.create_distplot([gdata], ['Standard Deviations'],  # fixed label typo ("Stdandard")
                         curve_type='kde', show_hist=True,
                         show_rug=True)
fig.update_layout(height=500, width=1000, showlegend=False)
fig.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-47-18604d074bba> in <module> 1 # let us review spread of all features after SCFS ----> 2 d1=X_trim1.select_dtypes(include='float') 3 stddev=pd.DataFrame(d1.std(),columns=["stddev"]) 4 gdata=list(stddev.stddev) 5 fig = ff.create_distplot([gdata],['Stdandard Deviations'], NameError: name 'X_trim1' is not defined
stddev.describe(percentiles=[0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9]).T
The least standard deviation is above 5 units, which is a remarkable improvement over the original dataset.
Let's check our model's learnability.
# dtc_pipe is a custom pipe : code in the beginning of notebook
# NOTE(review): X_trim1 was not created in the recorded run (MemoryError above)
dtc_pipe(X_trim1, X_test_trim1, Y_train, Y_test,"DTC4_SCFS1")
the model has improved further, showcasing better recall & f1 scores in FAIL class
lets try to perform SCFS after standardisation
%%time
# custom class code written at the beginning of the notebook
scfs_std=SCFS(kind='exp')
# standardize
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_quasi),columns=X_train_quasi.columns,index=X_train_quasi.index)
# evaluate feature scores
scfs_std.fit(X_train_std)
# trim the datasets based on score & thresholds
# custom class code written at the beginning of the notebook
# separated tuning parameter for faster grid search
fs_trim_std=SCFS_trim(75,scfs_std)
X_trim2=fs_trim_std.transform(X_train_quasi)
# sending non standardised data "X_train" with standardised learned fscore "scfs_std"
# the model pipe will perfrom standardisation again
# also, this method was built for those purposes of interchanging data sets with equivalent columns
# same functionality is already being used for trimming test dataset also.
# drop corresponding test features also
X_test_trim2=fs_trim_std.transform(X_test_quasi)
# review trimmed data shape
X_trim2.shape
# dtc_pipe is a custom pipe : code in the beginning of notebook
dtc_pipe(X_trim2, X_test_trim2, Y_train, Y_test,"DTC5_SCFS2_std")
This has improved the model learning beyond the previous model, with a better f1 score for the FAIL class.
this further justifies the use of the SCFS method
hence SCFS will be used after standardisation of the dataset
lets try tuning the threshold value
%%time
# obtain crossvalidation sets
Xtrains,Ytrains,Xvals,Yvals=cvSplitter(X_train_quasi,Y_train,k=10,seed=129)
# custom class code written at the beginning of the notebook
# define param grid
p_thresh = np.arange(0,101,5)
# create score log
scores=pd.DataFrame(columns=["thresh","trainf1","testf1"])
scores["thresh"]=p_thresh
# perform search
for i in range(len(Xtrains)):
for thresh in p_thresh:
scfs_clf=scfs_trim_dtc(thresh,scfs_std) # custom class code written at the beginning of the notebook
scfs_clf.fit(Xtrains[i],Ytrains[i])
tr_pred=scfs_clf.predict(Xtrains[i])
val_pred=scfs_clf.predict(Xvals[i])
scores.loc[scores["thresh"]==thresh,["trainf1"]]=metrics.f1_score(Ytrains[i],tr_pred,pos_label=1)
scores.loc[scores["thresh"]==thresh,["testf1"]]=metrics.f1_score(Yvals[i],val_pred,pos_label=1)
display(scores)
Based on the above results, let's choose the threshold of 50%, for which we find the best f1 score on the validation data.
# re-trim at the tuned 50% threshold and evaluate the model
tuned_trimmer = SCFS_trim(50, scfs_std)
X_trim3 = tuned_trimmer.transform(X_train_quasi)
X_test_trim3 = tuned_trimmer.transform(X_test_quasi)
dtc_pipe(X_trim3, X_test_trim3, Y_train, Y_test, "DTC6_SCFS3_tuned")
Tuning has not improved the FAIL-class scores.
Let's move on to further optimising the features.
-----------------------------------------------------------------------------------------------
by now, the following project statement have been covered and mentioned here to keep track
2.Data cleansing:
D. Check for multi-collinearity in the data and take necessary action.
# count distinct feature pairs with |correlation| > 0.75 in the base data
high_corr_cells = (X_train_quasi.corr().abs() > 0.75).sum().sum()
# subtract the diagonal (self-correlations) and halve the symmetric matrix
(high_corr_cells - X_train_quasi.shape[1]) / 2
there are 485 cases of high correlation, indicating high multi-collinearity among the dataset
lets check on SCFS trimmed data
# verify correlation in SCFS data (tuned 50% thresh — X_trim3 came from SCFS_trim(50, ...))
((abs(X_trim3.corr())>0.75).sum().sum()-X_trim3.shape[1])/2
Interestingly, there are only 4 cases of high correlation.
Let's investigate further, using Variance Inflation Factors.
by definition, the variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression. It is a measure for multicollinearity of the design matrix, exog.
One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this.
hence features having VIF above 5 needs to be studied for dropping
%%time
#obtain vif for base data
cols=X_train_quasi.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_train_quasi.values, i) for i in range(len(cols))]
# review the vif values
vif.describe().T
%%time
#obtain vif for SCFS data (60% thresh)
cols=X_trim3.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_trim3.values, i) for i in range(len(cols))]
# review vif values
vif.describe().T
Considering the two VIF distributions, it makes sense to start with the SCFS dataset:
as it has already taken cosine similarities into account, most multi-collinear features would have been removed.
%%time
# let is drop features for VIF > 5
X_vif_trim=X_trim3.copy()
while vif.max()[0]>5:
col=vif.index[np.argmax(vif["VIF"])] # select top vif column
X_vif_trim.drop(col,axis=1,inplace=True)
#recompute VIF
del vif
cols=X_vif_trim.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_vif_trim.values, i) for i in range(len(cols))]
# lets review the final vif
vif.describe().T
X_vif_trim.shape
Features with high VIF have successfully been dropped,
leaving behind 52 features.
# verify correlation in the VIF-trimmed data (count of pairs with |corr| > 0.75)
((abs(X_vif_trim.corr())>0.75).sum().sum()-X_vif_trim.shape[1])/2
as expected, there are no cases of multi-collinearity, shown by no correlation above 0.75
# study the model performance on the VIF-trimmed features
# test data: keep only the columns that survived the VIF trim
X_vif_test = X_test_trim3[X_vif_trim.columns]
dtc_pipe(X_vif_trim, X_vif_test, Y_train, Y_test,"DTC_4_vif")
The FAIL-class scores have improved slightly in terms of recall.
# review the spread of the features that survived the VIF trim
d1 = X_vif_trim
stddev = pd.DataFrame(d1.std(), columns=["stddev"])
gdata = list(stddev.stddev)
# distribution plot of the per-feature standard deviations
fig = ff.create_distplot([gdata], ['Standard Deviations'],  # fixed label typo ("Stdandard")
                         curve_type='kde', show_hist=True,
                         show_rug=True)
fig.update_layout(height=500, width=1000, showlegend=False)
fig.show()
The poor-variance features are still present; let's try the 75% SCFS data.
%%time
#obtain vif for SCFS data (75% thresh)
cols=X_trim2.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_trim2.values, i) for i in range(len(cols))]
# let is drop features for VIF > 5
X_vif_trim2=X_trim2.copy()
while vif.max()[0]>5:
col=vif.index[np.argmax(vif["VIF"])] # select top vif column
X_vif_trim2.drop(col,axis=1,inplace=True)
#recompute VIF
del vif
cols=X_vif_trim2.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_vif_trim2.values, i) for i in range(len(cols))]
X_vif_trim2.shape
this leaves us with only 40 features
# study the model performance on the 75%-thresh, VIF-trimmed features
# test data: keep only the columns that survived the VIF trim
X_vif_test2 = X_test_trim2[X_vif_trim2.columns]
dtc_pipe(X_vif_trim2, X_vif_test2, Y_train, Y_test,"DTC_5_vif2")
This reduces our model performance, hence we shall stick with the 50%-trimmed SCFS data optimised with VIF (DTC_4).
2.Data cleansing:
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions.
# apply skew corrections to the data
# (remap is a custom class: code at the beginning of the notebook)
remapper = remap()
X_train_rmp = remapper.fit_transform(X_vif_trim)
X_test_rmp = remapper.transform(X_vif_test)
# model performance on the remapped features
dtc_pipe(X_train_rmp, X_test_rmp, Y_train, Y_test, "DTC_6_remapped")
Skew correction has helped in terms of the FAIL-class scores and accuracy.
Apart from the skew correction above,
timestamp feature extraction and
polynomial feature extraction
were performed earlier in the notebook.
# let us put it back to the